kSanity-VIRGO metagenomics counts QC

Author

Laura Vermeren & Symul

Published

May 2, 2025

Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
library(labelled)
library(vegan)
library(RColorBrewer)

# tmp <- fs::dir_map("R scripts mg/", source)
tmp <- fs::dir_map("R scripts/", source)

theme_set(theme_light())

Loading the data

Code
data_source <- "real"
data_dir <- get_data_dir(data_source)

if (data_source == TRUE) {
  mg_dir <- str_c(data_dir, "03 metagenomics combined/")
  counts <- read_csv(str_c(mg_dir, "mg_combined.csv"))
  manifest <- read_csv(str_c(mg_dir, "mg_combined_manifest.csv"))
} else {
  mg_dir <- str_c(data_dir, "02 Metagenomics/")
  counts <- read_csv(str_c(mg_dir, "MVIBR_kSanityVIRGO2_ReadCounts_20250404.csv"))
  counts_corr <- read_csv(str_c(mg_dir, "MVIBR_kSanityVIRGO2_taxaGLcor_20250404.csv"))
  relabs <- read_csv(str_c(mg_dir, "MVIBR_kSanityVIRGO2_RelAbund_20250404.csv"))
  technical_metadata <- read_csv(str_c(mg_dir, "VIBRANT_MG_technicalMetaData_20250404.csv"))
  LBP_strain_info <- readxl::read_xlsx(str_c(data_dir, "00 Trial Data/IsolateNumbers.xlsx"))
  VIRGO2_taxonomic_table <- read_csv(str_c(mg_dir, "VIRGO2_taxonomy_key_250429.csv"))
}

We load the following tables, all provided by Michael France:

  • counts raw counts per LBP strain and taxa (dimensions: 966 samples x 783 columns)
  • counts_corr counts per LBP strain and taxa corrected by gene length (dimensions: 966 samples x 782 columns)
  • relabs relative abundances per LBP strain and taxa, computed by M. France from counts_corr (dimensions: 966 samples x 782 columns)
  • technical_metadata technical metadata (dimensions: 1127 samples x 19 columns)
  • VIRGO2_taxonomic_table taxonomic table for the taxa as defined in VIRGO2 (dimensions: 779 taxa x 10 columns)

We note that some samples have been sequenced twice to increase the coverage. The technical metadata contains a column LibrarySequencedTwice that indicates whether a sample has been sequenced twice.

Code
sample_sequences_twice <- 
technical_metadata |> 
  group_by(UID) |> 
  mutate(n = n()) |> 
  ungroup() |> 
  arrange(-n) |> 
  filter(n > 1) 

There are 161 samples that have been sequenced twice.

Code
sample_sequences_twice |> 
  select(UID, SampleType, Ext_Lib_Plate, SequencingRun, BioinformaticsProcessingBatch, `Selected4re-extraction`, LibrarySequencedTwice, Notes) |> 
  gt(caption = "Samples that have been sequenced twice") 

Note: once we have participants/visits info, we can check whether there are any systematic patterns in these samples.

We note some negative (although extremely small) values in the counts, counts_corr, and rel_abs assays (all for native crispatus, I assume it being an artifact from kSANITY). We will set these to 0.

We also load LBP_strain_info from the VIBRANT Dropbox that contains information on the LBP strains

The names of the strains do not correspond exactly to those on the VIBRANT Dropbox folder. We modify them to match those provided by Michael.

Code
LBP_strain_info <- 
  LBP_strain_info |> 
  mutate(
    strain_id = `VMRC ID`, 
    LBP = ifelse(is.na(LC106), "LC-115", "LC-106 & LC-115") |> factor(), 
    strain_origin = `Geographic source` |> factor()
  ) |>
  dplyr::rename(Biose_ID = `Biose ID`) |> #VMRC_ID = `VMRC ID` 
  dplyr::select(strain_id, LBP, strain_origin, Biose_ID) |> 
  arrange(strain_origin, LBP) |> 
  mutate(strain_id = strain_id |> fct_inorder()) |> 
  dplyr::select(strain_id, LBP, strain_origin, contains("ID")) |> 
  mutate(
    strain_id_mg = sub("_.*", "", strain_id),
    strain_id_mg = 
      case_when(
        strain_id_mg == "CC0028A1" ~ "C0028A1", 
        TRUE ~ strain_id_mg
      )
  )

LBP_strain_info |>   
  set_variable_labels(
    strain_id = "Strain ID", 
    strain_id_mg = "Strain ID (as in metagenomics data)",
    strain_origin = "Strain origin", 
    Biose_ID = "Biose ID"
  ) |> 
  gt(caption = "LBP strain information") |> 
  tab_style(style = cell_text(weight = "bold"),
            locations = cells_column_labels())
LBP strain information
Strain ID LBP Strain origin Biose ID Strain ID (as in metagenomics data)
FF00018 LC-106 & LC-115 SA GA08 FF00018
FF00051 LC-106 & LC-115 SA GA09 FF00051
UC101_2_33 LC-106 & LC-115 SA GA12 UC101
FF00004 LC-115 SA GA07 FF00004
FF00064 LC-115 SA GA10 FF00064
FF00072 LC-115 SA GA11 FF00072
UC119_2_PB0238 LC-115 SA GA13 UC119
122010_1999_16 LC-115 SA GA14 122010
185329_1999_17 LC-115 SA GA15 185329
C0059E1 LC-106 & LC-115 US GA03 C0059E1
C0022A1 LC-106 & LC-115 US GA04 C0022A1
C0175A1 LC-106 & LC-115 US GA06 C0175A1
C0006A1 LC-115 US GA01 C0006A1
CC0028A1 LC-115 US GA02 C0028A1
C0112A1 LC-115 US GA05 C0112A1

Creating a SummarizedExperiment object from these tables

The SummarizedExperiment object will contain the following assays:

  • counts
  • counts_corr
  • rel_abs

The SE colData will contain the technical metadata, the total number of reads for each sample, along with the metagenomics-based CST, subCST, and VALENCIA’s assignment scores. For samples that have been sequenced twice, we will merge the sequencing and processing information from both sequencing runs.

The SE rowData will contain the VIRGO2 taxonomic table, augmented with the LBP strain information.

NOTE : At the moment, the table counts includes an extra column “multiGenera”. For now, we remove the MuliGenera from the counts data frame; but in the future, Michael should provide the corrected counts and relative abundances with this extra column.

Code
mg_to_SE <- function(counts, counts_corr, relabs, LBP_strain_info, technical_metadata, VIRGO2_taxonomic_table) {
  
  technical_metadata_uid <- 
    technical_metadata |> 
    group_by(UID) |> 
    summarise(
      across(everything(), ~ unique(.) |> na.omit() |> str_c(collapse = "; ")),
    ) |> 
    mutate(
      LibrarySequencedTwice = ifelse(LibrarySequencedTwice == 1, TRUE, FALSE),
      Ext_Lib_Plate = Ext_Lib_Plate |> as.integer()
    ) 
  
  # remove "multiGenera" from counts # TODO: change later
  counts <- 
    counts |> 
    select(-c(MultiGenera))

  assay_counts <- 
    counts |> 
    dplyr::select(-c(CST, subCST, score)) |> 
    as.data.frame() |> 
    column_to_rownames("sampleID") |> 
    drop_na() |> 
    t() |> 
    pmax(0)
  
  assay_counts_corr <- 
    counts_corr |> 
    dplyr::select(-c(CST, subCST, score)) |>
    as.data.frame() |>
    column_to_rownames("sampleID") |>
    t() |> 
    pmax(0)
  
  assay_relative_ab <- 
    relabs |> 
    dplyr::select(-c(CST, subCST, score)) |>
    as.data.frame() |> 
    column_to_rownames("sampleID") |>
    t() |> 
    pmax(0)
    
  
  # include a total reads per sample in the colData 
  se_coldata <-
    counts |> 
    select(sampleID, CST, subCST, score) |>
    mutate(
      total_non_human_reads = rowSums(counts |> select(-c(sampleID, CST, subCST, score)))
      ) |> 
    select(sampleID, total_non_human_reads, everything()) |> 
    left_join(
      technical_metadata_uid,
      by = c("sampleID" = "UID")
    ) |>
    as.data.frame()
  
  se_rowdata <- 
    tibble(
      taxon_id = 
        counts |> 
        dplyr::select(-c(sampleID, CST, subCST, score)) |>
        colnames()
    ) |> 
    left_join(
      VIRGO2_taxonomic_table |> 
        mutate(
          taxon_id = Taxa,
          last_available_taxonomic_level = 
            Full_taxonomy |> str_remove_all(".*;") |> str_remove_all("_.*")
        ) |> 
        relocate(Full_taxonomy, .after = Species),
      by = join_by(taxon_id)
    ) |> 
    left_join(
      LBP_strain_info |> 
        dplyr::rename(taxon_id = strain_id_mg),
      by = join_by(taxon_id)
    ) |> 
    mutate(
      taxon_label = 
        ifelse(!is.na(LBP), "LBP strain ", "") |> 
        str_c(taxon_id |> str_replace_all("_", " ")) |> 
        str_c(
          ifelse(
            last_available_taxonomic_level %in% c("g","f"), 
            str_c(" (", last_available_taxonomic_level, ")"), ""
          )
        ) |> 
        str_c(
          ifelse(taxon_id == "Lactobacillus_crispatus", " (native strains)", "")
        )
    ) |> 
    relocate(taxon_label, .after = taxon_id) |>
    as.data.frame() 
    
  # Harmonization of the order of samples and feature
  
  ordered_sample_ids <- se_coldata$sampleID
  assay_counts <- assay_counts[, ordered_sample_ids]
  assay_counts_corr <- assay_counts_corr[, ordered_sample_ids]
  assay_relative_ab <- assay_relative_ab[, ordered_sample_ids]

  ordered_taxa <- se_rowdata$taxon_id
  assay_counts <- assay_counts[ordered_taxa, ]
  assay_counts_corr <- assay_counts_corr[ordered_taxa, ]
  assay_relative_ab <- assay_relative_ab[ordered_taxa, ]

  SummarizedExperiment::SummarizedExperiment(
    assays = list(counts = assay_counts, counts_corr = assay_counts_corr, rel_abs = assay_relative_ab),
    rowData = se_rowdata,
    colData = se_coldata
  )
}
Code
SE_mg <- mg_to_SE(counts, counts_corr, relabs, LBP_strain_info, technical_metadata, VIRGO2_taxonomic_table)

Exploratory & QC analyses

Code
SE_mg
# A SummarizedExperiment-tibble abstraction: 751,548 × 45
# Features=778 | Samples=966 | Assays=counts, counts_corr, rel_abs
   .feature .sample   counts counts_corr rel_abs sampleID total_non_human_reads
   <chr>    <chr>      <dbl>       <dbl>   <dbl> <chr>                    <dbl>
 1 122010   MG_1010       0           0   0      MG_1010               9671181.
 2 185329   MG_1010       0           0   0      MG_1010               9671181.
 3 C0006A1  MG_1010       0           0   0      MG_1010               9671181.
 4 C0022A1  MG_1010 1408234.       1494.  0.137  MG_1010               9671181.
 5 C0028A1  MG_1010       0           0   0      MG_1010               9671181.
 6 C0059E1  MG_1010  268235.        285.  0.0261 MG_1010               9671181.
 7 C0112A1  MG_1010       0           0   0      MG_1010               9671181.
 8 C0175A1  MG_1010 6571758.       6971.  0.639  MG_1010               9671181.
 9 FF00004  MG_1010       0           0   0      MG_1010               9671181.
10 FF00018  MG_1010       0           0   0      MG_1010               9671181.
# ℹ 40 more rows
# ℹ 38 more variables: CST <chr>, subCST <chr>, score <dbl>, SampleType <chr>,
#   Ext_Lib_Plate <int>, Ext_Lib_Plate_ID <chr>, Ext_Lib_Position <chr>,
#   SequencingRun <chr>, DateSequenced <chr>, Lane <chr>, Sample <chr>,
#   `Library Pool` <chr>, Library <chr>, FragmentSize <chr>, IndexSet <chr>,
#   `Index 1` <chr>, `Index 2` <chr>, BioinformaticsProcessingBatch <chr>,
#   `Selected4re-extraction` <chr>, LibrarySequencedTwice <lgl>, Notes <chr>, …

Total number of counts/relative abundances per sample

Total number of non-human reads per sample

Code
SE_mg |> 
  colData() |> 
  as_tibble() |>
  ggplot() +
  aes(x = total_non_human_reads, fill = LibrarySequencedTwice) +
  geom_histogram(bins = 50) +
  scale_x_log10() +
  facet_grid(SampleType ~ ., scale = "free") +
  labs(
    x = "Total number of non-human reads per sample", 
    y = "Number of samples"
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
SE_mg |> 
  colData() |> 
  as_tibble()  |> 
  arrange(total_non_human_reads) |> 
  mutate(index = row_number()) |>
  select(index, sampleID, total_non_human_reads, SampleType, LibrarySequencedTwice, subCST) |> 
  filter(total_non_human_reads < 1e6) |>
  gt()
index sampleID total_non_human_reads SampleType LibrarySequencedTwice subCST
1 MG_EQ05822476 2 Nuclease-free water FALSE IV-A
2 MG_2179903 107164 ClinicalSample FALSE III-B
3 MG_EQ05822493 111049 Unused swab+C2 FALSE III-B
4 MG_2694 122821 ClinicalSample FALSE III-B
5 MG_EQ05822499 122897 Nuclease-free water FALSE IV-B
6 MG_2196625 211151 ClinicalSample FALSE III-B
7 MG_2309505 284381 ClinicalSample FALSE III-B
8 MG_3968 303568 ClinicalSample FALSE IV-B
9 MG_2287336 380193 ClinicalSample FALSE IV-B
10 MG_2193899 425144 ClinicalSample TRUE III-B
11 MG_2250465 429267 ClinicalSample FALSE IV-C1
12 MG_3344 446679 ClinicalSample FALSE III-B
13 MG_5747 469552 ClinicalSample FALSE III-A
14 MG_2285838 528459 ClinicalSample FALSE III-B
15 MG_3458 548033 ClinicalSample FALSE III-B
16 MG_2311303 549857 ClinicalSample FALSE III-B
17 MG_2304216 580999 ClinicalSample TRUE IV-A
18 MG_1588 618230 ClinicalSample FALSE IV-B
19 MG_EQ05822501 635475 Nuclease-free water FALSE III-B
20 MG_2265171 722162 ClinicalSample FALSE I-B
21 MG_2289558 725940 ClinicalSample FALSE III-B
22 MG_2304663 733772 ClinicalSample FALSE IV-B
23 MG_972 767569 ClinicalSample FALSE III-A
24 MG_2192710 795144 ClinicalSample TRUE IV-C1
25 MG_2251190 805893 ClinicalSample FALSE IV-B
26 MG_4112 819924 ClinicalSample TRUE IV-B
27 MG_3146 822383 ClinicalSample FALSE III-B
28 MG_2289564 846815 ClinicalSample FALSE IV-B
29 MG_EQ05822500 857771 Unused swab+C2 FALSE III-B
30 MG_2240695 866059 ClinicalSample FALSE IV-B
31 MG_1291 878598 ClinicalSample FALSE III-A
32 MG_3282 882023 ClinicalSample FALSE IV-B
33 MG_4825 925127 ClinicalSample FALSE IV-B
34 MG_2287920 930056 ClinicalSample FALSE IV-B
35 MG_1507 932868 ClinicalSample FALSE III-B
36 MG_2268611 958264 ClinicalSample FALSE III-B
37 MG_2242291 996799 ClinicalSample TRUE IV-B

TODO: check if correlated with post-MTZ visit

Total relative abundances per sample

We check that the relative abundances sum to 1 for each sample.

Code
tol <- 1e-5

SE_mg |> 
  as_tibble() |> 
  group_by(.sample) |>
  summarise(total_rel_abs = sum(rel_abs)) |> 
  ggplot(aes(x = total_rel_abs)) +
  geom_histogram(binwidth = tol) + 
  labs(x = "Total relative abundances per sample", 
       y = "Number of samples") +
  scale_x_continuous(limits = 1 + c(-1, 1) * 5 * tol) 

Code
rm(tol)
Code
# We check the relative abundances computation (should be `counts_corr/sum(counts_corr)`).

tmp <- 
  SE_mg |> 
  as_tibble() |> 
  group_by(.sample) |>
  mutate(rel_abs_check = counts_corr/sum(counts_corr)) |> 
  ungroup() 

if (max(tmp$rel_abs_check - tmp$rel_abs) > 0.01) warning("!! Relative abundances do not match the computed values !!")

Non-bacterial DNA

There are some non-bacterial taxa in the dataset:

Code
SE_mg |> 
  as_tibble() |> 
  filter(Category != "Bacteria") |> 
  group_by(.sample, Category) |> 
  summarize(rel_abs = sum(rel_abs), .groups = "drop") |> 
  ggplot() +
  aes(x = rel_abs, fill = Category) +
  geom_histogram(binwidth = 0.001) +
  facet_wrap(~ Category, scales = "free") +
  scale_x_continuous("Total relative abundance per sample for that group", labels = scales::percent_format()) +
  ylab("Number of samples") +
  guides(fill = "none")

Code
SE_mg |> 
  as_tibble() |> 
  filter(Category != "Bacteria") |> 
  group_by(.sample) |>
  mutate(total_rel_ab = sum(rel_abs)) |>
  ungroup() |> 
  arrange(-total_rel_ab, -rel_abs) |> 
  mutate(
    .sample = .sample |> fct_inorder(),
    taxon_label = taxon_label |> fct_inorder()
  ) |> 
  filter(as.numeric(.sample) <= 20) |>
  ggplot() +
  aes(x = rel_abs, y = .sample |> fct_rev(), fill = Category) +
  geom_col() +
  scale_x_continuous("Relative abundance", labels = scales::percent_format()) +
  ylab("Top 20 samples with most non-bacterial relative abundances") 

Code
SE_mg |> 
  as_tibble() |> 
  filter(Category != "Bacteria") |> 
  group_by(taxon_label) |>
  mutate(total_rel_ab = sum(rel_abs), max_rel_ab = max(rel_abs)) |>
  ungroup() |> 
  arrange(-total_rel_ab, -rel_abs) |> 
  mutate(
    .sample = .sample |> fct_inorder(),
    taxon_label = taxon_label |> fct_inorder()
  ) |> 
  filter(as.numeric(taxon_label) <= 20, max_rel_ab > 0.001) |>
  ggplot() +
  aes(x = rel_abs, y = taxon_label |> fct_rev(), color = Category) +
  geom_point(alpha = 0.7) +
  scale_x_continuous("Relative abundance", labels = scales::percent_format()) +
  ylab("Top non-bacterial organisms") +
  theme(strip.text.y = element_text(angle = 0)) +
  labs(caption = "Each dot is a sample.")

Consequently, we create a new assay which contains the relative abundances of the bacteria only.

Code
SE_mg <- 
  SE_mg |>
  mutate(
    rel_abs_bact = rel_abs * (!is.na(Category) & (Category == "Bacteria"))
  )

assay(SE_mg, "rel_abs_bact") <- t(t(assay(SE_mg, "rel_abs_bact"))/colSums(assay(SE_mg, "rel_abs_bact")))

Control samples

There are 4 categories of control samples:

Code
SE_mg |> 
  colData() |> 
  as_tibble() |> 
  filter(SampleType != "ClinicalSample") |>
  select(SampleType, sampleID, Ext_Lib_Plate) |> 
  group_by(SampleType) |> 
  arrange(SampleType, Ext_Lib_Plate) |>
  gt(caption = "Control samples", row_group_as_column = TRUE) 
Control samples
sampleID Ext_Lib_Plate
Mock 1 MG_EQ05822515 1
MG_EQ05822481 5
MG_EQ05822509 8
MG_EQ05822503 10
MG_EQ05822445 11
Mock 2 MG_EQ05822227 1
MG_EQ05822473 3
MG_EQ05822497 10
MG_EQ05822439 11
Nuclease-free water MG_EQ05822226 1
MG_EQ05822476 3
MG_EQ05822499 5
MG_EQ05822501 8
MG_EQ05822491 10
Unused swab+C2 MG_EQ05822202 1
MG_EQ05822493 5
MG_EQ05822500 8
MG_EQ05822485 10

These control samples have been created such that

  • Mock 1: pooled patient samples from CAP084 swabs

    • 50 non-BV
      • VMRC 15 L.crispatus isolates
      • 12 L.crispatus dominant samples
      • 23 L. iners dominant samples
    • 50 BV
      • 24 G. vaginalis dominant samples
      • BV bacteria, not G. vaginalis dominant
  • Mock 2: Pooled pure isolates (equal volumes of pure isolates at OD 0.1)

    • VMRC 15 L.crispatus isolates
    • G. vaginalis ATCC
    • A. vaginae ATCC
    • P. bivia ATCC
    • L. crispatus ATCC
    • L. gasseri ATCC
    • S. agalactiae ATCC
    • L. jensenii ATCC
    • F. magna

Taxonomic composition of these control samples:

Code
get_taxa_colors <- function(taxa) {
  tibble(
    taxa = taxa
  ) |> 
    mutate(
      cat = 
        case_when(
          taxa == "Other" ~ "Other",
          str_detect(taxa, "crispatus") ~ "crispatus",
          str_detect(taxa, "LBP") ~ "LBP",
          str_detect(taxa, "iner") ~ "iners",
          str_detect(taxa, "revotella") ~ "prevotella",
          str_detect(taxa, "ardnerella") ~ "gardnerella",
          TRUE ~ "non lacto"
        )
    ) |> 
    group_by(cat) |>
    mutate(
      color = 
        case_when(
          taxa == "Other" ~ "gray80",
          str_detect(taxa, "crispatus") ~ "orange",
          str_detect(taxa, "LBP") ~ colorRampPalette(c("orangered1", "red4"))(n()),
          str_detect(taxa, "iner") ~ "green3",
           str_detect(taxa, "jensenii") ~ "green4",
          str_detect(taxa, "ardnerella") ~ colorRampPalette(c("dodgerblue1", "dodgerblue4"))(n()),
          str_detect(taxa, "revotella") ~ colorRampPalette(c("purple1", "purple4"))(n()),
          TRUE ~ colorRampPalette(c("turquoise3", "black"))(n())
        )
    ) |> 
    pull(color)
}
Code
control_types <- SE_mg$SampleType |> unique() |> sort() |> extract(-1)
Code
map(
  control_types,
  ~ {
    tmp <- 
      SE_mg |> 
      as_tibble() |> 
      filter(SampleType == .x) |> 
      filter(rel_abs > 0) 
    top_taxa <- 
      tmp |> 
      group_by(taxon_label) |> 
      summarise(total_relabs = sum(rel_abs)) |> 
      arrange(desc(total_relabs)) |> 
      slice_head(n = 25) |> 
      pull(taxon_label) |> sort()
    
    all_taxa <- tmp$taxon_label |> unique() |> sort()
    
    tmp |> 
      ggplot() +
      aes(y = rel_abs, x = .sample, fill = taxon_label) +
      geom_col(color = "white", linewidth = 0.1) +
      scale_x_discrete("Samples") +
      scale_y_continuous("Rel. ab.") +
      scale_fill_discrete(
        "Top taxa", breaks = top_taxa #, limits = all_taxa, values = get_taxa_colors(all_taxa)
        ) + 
      facet_grid(. ~ str_c("Plate ", Ext_Lib_Plate), scales = "free") +
      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
      labs(title = str_c("Control samples : ", .x))
  }
)
[[1]]


[[2]]


[[3]]


[[4]]

And it looks like the replicability across controls (plates?) is not very good.

Mock 1

We see that the relative abundances are not very replicable, but let’s check if detection (presence/absence) is better.

Code
tmp <- 
  SE_mg |> 
  as_tibble() |> 
  filter(SampleType == "Mock 1") |> 
  mutate(present = (rel_abs > 1/1000)) |> 
  group_by(taxon_label) |>
  mutate(tot_prop = sum(rel_abs)) |> 
  ungroup() |> 
  arrange(-tot_prop) |> 
  mutate(taxon_label = taxon_label |> fct_inorder()) 
Code
tmp |> 
  ggplot() +
  aes(x = taxon_label, y = .sample, fill = rel_abs |> log10()) +
  geom_tile() +
  ylab("") +
  scale_x_discrete("Taxa", breaks = NULL) +
  scale_fill_gradient(low = "gray99", high = "red", na.value = "white")

Mock 2

In the Mock 2, we check for the proportion of species/taxa that were not expected based on the theoretical composition of Mock 2

Code
# Focus on Mock2

tmp <- 
  SE_mg |> 
  as_tibble() |> 
  filter(SampleType == "Mock 2") |> 
  filter(rel_abs > 0) |> 
  mutate(
    expected = 
      str_detect(taxon_label, "LBP") | 
      str_detect(taxon_label, "Gardnerella") |
      str_detect(taxon_label, "Fannyhessea") |
      str_detect(taxon_label, "bivia") |
      str_detect(taxon_label, "crispatus") |
      str_detect(taxon_label, "gasseri") |
      str_detect(taxon_label, "jensenii") |
      str_detect(taxon_label, "Streptococcus agalactiae") |
      str_detect(taxon_label, "Finegoldia magna") 
  )

tmp |> 
  ggplot() +
  aes(y = rel_abs, x = .sample, fill = expected) +
  geom_col(color = "white", linewidth = 0.1) +
  scale_x_discrete("Samples") +
  scale_y_continuous("Rel. ab.") + 
  facet_grid(. ~ str_c("Plate ", Ext_Lib_Plate), scales = "free") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code
tmp |> 
  group_by(.sample, Ext_Lib_Plate, expected) |> 
  summarize(tot_rel_ab_pc = round(100 * sum(rel_abs), 2), .groups = "drop") |>
  filter(!expected) |> 
  gt(caption = "Percent of relative abundance with unexpected taxa")
Percent of relative abundance with unexpected taxa
.sample Ext_Lib_Plate expected tot_rel_ab_pc
MG_EQ05822227 1 FALSE 4.53
MG_EQ05822439 11 FALSE 3.53
MG_EQ05822473 3 FALSE 1.83
MG_EQ05822497 10 FALSE 3.13

When only considering bacterial relative abundances, results look similar:

Code
# Focus on Mock2

tmp <- 
  SE_mg |> 
  as_tibble() |> 
  filter(SampleType == "Mock 2") |> 
  filter(rel_abs_bact > 0) |> 
  mutate(
    expected = 
      str_detect(taxon_label, "LBP") | 
      str_detect(taxon_label, "Gardnerella") |
      str_detect(taxon_label, "Fannyhessea") |
      str_detect(taxon_label, "bivia") |
      str_detect(taxon_label, "crispatus") |
      str_detect(taxon_label, "gasseri") |
      str_detect(taxon_label, "jensenii") |
      str_detect(taxon_label, "Streptococcus agalactiae") |
      str_detect(taxon_label, "Finegoldia magna") 
  )

tmp |> 
  ggplot() +
  aes(y = rel_abs_bact, x = .sample, fill = expected) +
  geom_col(color = "white", linewidth = 0.1) +
  scale_x_discrete("Samples") +
  scale_y_continuous("Rel. ab. (of bacterial content)") + 
  facet_grid(. ~ str_c("Plate ", Ext_Lib_Plate), scales = "free") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code
tmp |> 
  group_by(.sample, Ext_Lib_Plate, expected) |> 
  summarize(tot_rel_ab_pc = round(100 * sum(rel_abs_bact), 2), .groups = "drop") |>
  filter(!expected) |> 
  gt(caption = "Percent of relative abundance with unexpected taxa")
Percent of relative abundance with unexpected taxa
.sample Ext_Lib_Plate expected tot_rel_ab_pc
MG_EQ05822227 1 FALSE 4.52
MG_EQ05822439 11 FALSE 3.50
MG_EQ05822473 3 FALSE 1.82
MG_EQ05822497 10 FALSE 3.13

It could be annoying that we are so close to the detection threshold for the primary outcomes definition.

Alpha-diversity of clinical samples and controls

Code
vegan::diversity(SE_mg |> assay("rel_abs"), index = "shannon", MARGIN = 2) |> 
  as_tibble() |> 
  mutate(sampleID = colnames(SE_mg)) |> 
  left_join(SE_mg |> colData() |> as_tibble(), by = join_by(sampleID)) |> 
  ggplot() +
  aes(x = SampleType, y = value |> exp(), col = SampleType, fill = SampleType) +
  geom_violin(scale = "width", color = NA, alpha = 0.2) +
  ggbeeswarm::geom_quasirandom(size = 0.5, alpha = 0.5) +
  scale_y_log10() +
  labs(
    x = "",
    y = "Effective number of species based on shannon diversity index"
  ) +
  guides(fill = "none", color = "none") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Negative controls have lower total reads and lower alpha-diversity.

Proportion of LBP strains per sample

Do we detect any LBP strain?

Code
SE_mg |> 
  as_tibble() |> 
  filter(!is.na(LBP)) |> 
  ggplot() +
  aes(x = rel_abs, fill = (rel_abs == 0)) +
  geom_histogram(binwidth = 0.01) +
  scale_fill_manual("", values = c("steelblue", "gray"), labels = c("Rel. ab = 0", "Rel. ab > 0")) +
  xlab("Relative abundance of LBP strains") +
  ylab("Number of samples x LBP strain")

Code
SE_mg |> 
  as_tibble() |> 
  filter(!is.na(LBP), rel_abs > 0) |> 
  ggplot() +
  aes(x = rel_abs) +
  geom_histogram(binwidth = 0.001) +
  xlab("Non-zero relative abundance of LBP strains") +
  ylab("Number of samples") +
  facet_grid(LBP + .feature ~ .) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
SE_mg |> 
  as_tibble() |> 
  filter(!is.na(LBP), rel_abs > 0) |>
  ggplot() +
  aes(x = .feature) +
  geom_bar() +
  scale_x_discrete("LBP strains") +
  scale_y_continuous("Number of samples with non-zero relative abundances") +
  facet_grid(. ~ LBP, scales = "free", space = "free") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Code
number_of_non_zero_LBP_strains_per_sample <- 
  SE_mg |> 
  as_tibble() |>  
  filter(!is.na(LBP)) |> 
  group_by(.sample) |> 
  summarize(
    number_of_non_zero_LBP_strains = sum(rel_abs > 0),
  ) 

number_of_non_zero_LBP_strains_per_sample |> 
  ggplot() +
  aes(x = number_of_non_zero_LBP_strains) +
  geom_histogram(binwidth = 0.5) +
  scale_x_continuous("Number of non-zero LBP strains per sample", breaks = 0:15, minor_breaks = 0) +
  ylab("Number of samples") 

There are 234 samples (24.22%) with at least one LBP strain detected (non-zero relative abundance).

Code
SE_mg |> 
  as_tibble() |>  
  filter(!is.na(LBP)) |> 
  left_join(
    number_of_non_zero_LBP_strains_per_sample,
    by = join_by(.sample)
  ) |>
  filter(number_of_non_zero_LBP_strains != 0) |>
  group_by(.feature) |>
  mutate(tot_rel_abs_for_strain = sum(rel_abs)) |> 
  ungroup() |> 
  arrange(LBP, -tot_rel_abs_for_strain) |>
  mutate(.feature = .feature |> fct_inorder()) |> 
  group_by(.sample) |> 
  mutate(
    LBP_rel_abs = rel_abs / sum(rel_abs),
    score = weighted.mean(LBP_rel_abs, .feature |> as.numeric()),
    total_LBP_rel_abs = sum(rel_abs)
  ) |> 
  ungroup() |> 
  arrange(score) |> 
  mutate(.sample = .sample |> fct_inorder()) |> 
  ggplot() +
  aes(x = .feature, y = .sample, fill = rel_abs) +
  geom_tile() +
  scale_fill_continuous("Rel. ab", low = "white", high = "steelblue2") +
  scale_x_discrete("Strains") +
  scale_y_discrete("Samples", breaks = NULL) +
  facet_grid(. ~ LBP + strain_origin, scales = "free_x", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    caption = "Rel. ab. of LBP strains in samples with at least one LBP strain detected (non-zero)",
  )

Code
samples_with_only_one_LBP_strain <- 
  number_of_non_zero_LBP_strains_per_sample |> 
  filter(number_of_non_zero_LBP_strains == 1) |> 
  pull(.sample)

SE_mg |> 
  as_tibble() |> 
  filter(!is.na(LBP)) |> 
  left_join(number_of_non_zero_LBP_strains_per_sample, by = ".sample") |>
  filter(number_of_non_zero_LBP_strains != 0) |> 
  ggplot() +
  aes(x = rel_abs, fill = number_of_non_zero_LBP_strains |> factor()) +
  geom_histogram() +
  facet_grid(number_of_non_zero_LBP_strains ~ taxon_label) +
  theme(strip.text.y = element_text(angle = 0))

Compositions

Code
summarized_rel_abs <- 
  SE_mg |> 
  as_tibble() |> 
  group_by(.feature) |> 
  mutate(max_rel_ab = max(rel_abs), mean_rel_ab = mean(rel_abs)) |> 
  ungroup() |> 
  mutate(
    taxa = 
      case_when(
        !is.na(LBP) ~ taxon_label,
        max_rel_ab > 1/3 ~ taxon_label,
        TRUE ~ "Other"
      ),
    is_lacto = str_detect(Genus, "Lactobacillus") & (taxa != "Other")
  ) |> 
  arrange(LBP, is_lacto, desc(max_rel_ab)) |>
  mutate(
    taxa = taxa |> fct_inorder()
  ) |> 
  group_by(.sample, SampleType, taxa, LBP, is_lacto) |>
  summarize(rel_abs = sum(rel_abs), .groups = "drop") |>
  arrange(.sample, -rel_abs) |> 
  group_by(.sample) |>
  mutate(
    score = weighted.mean(taxa |> as.numeric(), rel_abs), 
    tot = sum(rel_abs),
    dom = taxa[1]
    ) |> 
  ungroup() |> 
  arrange(score) |> 
  mutate(.sample = .sample |> fct_inorder())
Code
summarized_rel_abs |> 
  ggplot() +
  aes(x = .sample, y = rel_abs, fill = taxa) +
  geom_col() +
  scale_x_discrete("Samples", breaks = NULL) +
  facet_grid(. ~ SampleType, scales = "free", space = "free") +
  scale_fill_manual(
    "", 
    values = get_taxa_colors(summarized_rel_abs$taxa |> levels()),
    guide = guide_legend(nrow = 5)
    ) +
  theme(
    strip.text.x = element_text(angle = 90, hjust = 0),
    legend.position = "bottom",
  )

Technical metadata

Code
data_plot <-
  technical_metadata |>
  select(-c(UID, Sample, SampleType, Library, Notes, FragmentSize, IndexSet, `Index 1`, `Index 2`, Ext_Lib_Position)) |>
  mutate(across(everything(), as.factor)) |>
  dplyr::rename(
    LibraryPool = `Library Pool`,
    Selected4re_extraction = `Selected4re-extraction`
  )

generate_plot <- function(df, x_var, fill_var) {
  n_levels_fill <- nlevels(df[[fill_var]])
  if (n_levels_fill <= 12) {
    ggplot(df, aes_string(x = x_var, fill = fill_var)) +
      geom_bar(position = "stack") +
      theme_bw() +
      scale_fill_manual(values = brewer.pal(n = n_levels_fill, name = "Set3")) +
      labs(x = x_var, fill = fill_var, y = "Number of samples") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))
  } else {
    message(cat("Trop de niveaux : x = ",x_var, "fill =", fill_var))
    invisible(NULL) # Retourne NULL pour que walk() ne tente pas d'imprimer un message
  }
}

factor_vars <- names(data_plot)

combinations <- cross(list(x = factor_vars, fill = factor_vars)) %>%
  keep(~ which(.x$x == factor_vars) < which(.x$fill == factor_vars)) 

# Appliquer la fonction de génération de graphique à chaque combinaison
walk(combinations, ~ print(generate_plot(data_plot, .x$x, .x$fill)))

Trop de niveaux : x =  Ext_Lib_Plate fill = LibraryPool
NULL
Trop de niveaux : x =  Ext_Lib_Plate_ID fill = LibraryPool
NULL
Trop de niveaux : x =  SequencingRun fill = LibraryPool
NULL
Trop de niveaux : x =  DateSequenced fill = LibraryPool
NULL
Trop de niveaux : x =  Lane fill = LibraryPool
NULL

PCoA on counts

Code
dist_matrix <-
  vegdist(
    assay(SE_mg, "counts") |> t(), 
    method = "bray")

pcoa <- wcmdscale(dist_matrix, eig = TRUE)
print(pcoa)
Call: wcmdscale(d = dist_matrix, eig = TRUE)

          Inertia Rank
Total      359.39     
Real       428.63  344
Imaginary  -69.24  621

Results have 966 points, 344 axes

Eigenvalues:
  [1] 60.97 50.39 33.62 21.71 15.98 13.65 12.50 10.89  9.51  8.84  8.08  6.79
 [13]  6.25  6.08  5.51  4.77  4.59  4.39  4.38  4.21  4.07  3.73  3.61  3.32
 [25]  3.15  3.07  2.94  2.81  2.69  2.58  2.46  2.45  2.32  2.27  2.20  2.10
 [37]  2.04  1.96  1.87  1.80  1.71  1.62  1.56  1.54  1.52  1.44  1.40  1.38
 [49]  1.32  1.29  1.24  1.21  1.18  1.17  1.12  1.10  1.08  1.07  1.05  1.03
 [61]  1.00  0.97  0.95  0.93  0.89  0.88  0.87  0.85  0.82  0.80  0.79  0.77
 [73]  0.75  0.74  0.72  0.72  0.71  0.68  0.67  0.66  0.64  0.63  0.62  0.61
 [85]  0.60  0.59  0.58  0.57  0.55  0.54  0.53  0.52  0.51  0.51  0.50  0.49
 [97]  0.48  0.48  0.47  0.46  0.45  0.45  0.44  0.43  0.42  0.42  0.41  0.41
[109]  0.40  0.40  0.39  0.39  0.38  0.38  0.37  0.36  0.36  0.35  0.34  0.34
(Showing 120 of 965 eigenvalues)

Weights: Constant
Code
pcoa$eig |>
  as.data.frame() |>
  rownames_to_column("axis") |>
  mutate(axis = str_remove(axis, "Eigenvalue"), 
         var_explained = 100*pcoa$eig/sum(pcoa$eig)) |>
  slice_head(n = 10) |>
  gt() |>
  tab_header(title = "Eigenvalues of the PCoA") |>
  cols_label(axis = "Axis", `pcoa$eig` = "Eigenvalue", var_explained = "Variance explained") |>
  fmt_number(columns = vars(`pcoa$eig`, var_explained), decimals = 3)
Eigenvalues of the PCoA
Axis Eigenvalue Variance explained
1 60.975 16.966
2 50.394 14.022
3 33.616 9.353
4 21.708 6.040
5 15.980 4.446
6 13.648 3.798
7 12.496 3.477
8 10.893 3.031
9 9.509 2.646
10 8.836 2.459
Code
pcoa$eig |> 
  as_data_frame() |>
  rownames_to_column("axis") |>
  mutate(axis = axis |> as.numeric()) |>
  ggplot(aes(x = axis, y = value)) +
  geom_col() +
  labs(
    x = "Axis",
    y = "Eigenvalue"
  ) 

Code
pcoa$eig |> 
  as_data_frame() |>
  rownames_to_column("axis") |>
  mutate(axis = axis |> as.numeric()) |>
  filter(axis <= 20) |> 
  ggplot(aes(x = axis, y = value)) +
  geom_col() +
  labs(
    x = "Axis",
    y = "Eigenvalue"
  ) 

Code
pcoa_data <- 
  tibble(
    sampleID = rownames(pcoa$points),
    PCoA = pcoa$points
  ) |> 
  left_join(
    SE_mg |> colData() |> as_tibble(),
    by = join_by(sampleID)
  )
Code
pcoa_plot <- function(pcoa_tb, pcoa, color_var, axes = 1:2){
  
  pcoa_tb <- 
    pcoa_tb |> 
    mutate(
      col = as.factor(!!sym(color_var)),
      x = PCoA[, axes[1]],
      y = PCoA[, axes[2]]
      )

  pcoa_tb |>
    ggplot() +
    aes(x = x, y = y, color = col) +
    geom_point(size = 2, alpha = 0.7) +
    coord_fixed() +
    labs(
      x = paste0("PCoA ", axes[1], " (", round(100 * pcoa$eig[axes[1]] / sum(pcoa$eig), 1), "%)"),
      y = paste0("PCoA ", axes[2], " (", round(100 * pcoa$eig[axes[2]] / sum(pcoa$eig), 1), "%)"),
      color = color_var
    ) 
}

Type of sample

Code
sample_type_colors <- c("gray", "dodgerblue", "orange", "black", "red")

pcoa_plot(pcoa_data, pcoa, "SampleType", c(1, 2)) + scale_color_manual(values = sample_type_colors) 

Code
pcoa_plot(pcoa_data, pcoa, "SampleType", c(3, 4)) + scale_color_manual(values = sample_type_colors) 

Code
manova <- adonis2(dist_matrix ~ pcoa_data$SampleType)
manova
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix ~ pcoa_data$SampleType)
          Df SumOfSqs      R2     F Pr(>F)    
Model      4     4.42 0.01229 2.989  0.001 ***
Residual 961   354.97 0.98771                 
Total    965   359.39 1.00000                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Extraction Plate

Code
pcoa_plot(pcoa_data, pcoa, "Ext_Lib_Plate", c(1, 2))

Code
pcoa_plot(pcoa_data, pcoa, "Ext_Lib_Plate", c(3, 4))

Permutation test

Code
manova <- adonis2(dist_matrix ~ pcoa_data$Ext_Lib_Plate)
manova
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix ~ pcoa_data$Ext_Lib_Plate)
          Df SumOfSqs      R2      F Pr(>F)    
Model      1     4.57 0.01272 12.423  0.001 ***
Residual 964   354.82 0.98728                  
Total    965   359.39 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Bioinformatics Processing Batch

Code
pcoa_plot(pcoa_data, pcoa, "BioinformaticsProcessingBatch", c(1, 2))

Code
pcoa_plot(pcoa_data, pcoa, "BioinformaticsProcessingBatch", c(3, 4))

Library Pool

Code
pcoa_plot(pcoa_data, pcoa, "Library.Pool", c(1, 2))

Code
pcoa_plot(pcoa_data, pcoa, "Library.Pool", c(3, 4))

Lane

Code
pcoa_plot(pcoa_data |> filter(Lane %in% 1:4), pcoa, "Lane", c(1, 2))

Code
pcoa_plot(pcoa_data |> filter(Lane %in% 1:4), pcoa, "Lane", c(3, 4))

PCoA on relative abundances

Code
dist_matrix_relab <-
  vegdist(
    relabs |> dplyr::select(-c(CST, subCST, score)) |> column_to_rownames("sampleID"), 
    method = "bray")

pcoa_relab <- wcmdscale(dist_matrix_relab, eig = TRUE)
print(pcoa_relab)
Call: wcmdscale(d = dist_matrix_relab, eig = TRUE)

          Inertia Rank
Total      309.98     
Real       381.84  326
Imaginary  -71.86  639

Results have 966 points, 326 axes

Eigenvalues:
  [1] 71.59 59.11 39.07 19.66 15.30 12.04 10.41  8.62  7.37  6.53  5.77  5.39
 [13]  5.24  4.78  4.31  4.24  3.83  3.75  3.24  3.15  2.98  2.79  2.68  2.57
 [25]  2.24  2.16  2.08  1.94  1.77  1.73  1.59  1.54  1.53  1.41  1.36  1.35
 [37]  1.28  1.25  1.18  1.16  1.12  1.09  1.05  1.03  0.98  0.96  0.93  0.91
 [49]  0.88  0.87  0.82  0.80  0.79  0.78  0.76  0.75  0.74  0.70  0.69  0.67
 [61]  0.66  0.64  0.63  0.62  0.59  0.56  0.56  0.55  0.54  0.52  0.51  0.49
 [73]  0.49  0.49  0.48  0.47  0.46  0.45  0.44  0.43  0.42  0.41  0.41  0.40
 [85]  0.39  0.38  0.38  0.37  0.36  0.36  0.35  0.35  0.34  0.33  0.33  0.32
 [97]  0.31  0.30  0.30  0.29  0.29  0.28  0.28  0.27  0.26  0.26  0.26  0.26
[109]  0.25  0.24  0.24  0.24  0.24  0.23  0.23  0.23  0.22  0.22  0.21  0.21
(Showing 120 of 965 eigenvalues)

Weights: Constant
Code
pcoa_relab$eig |>
  as.data.frame() |>
  rownames_to_column("axis") |>
  mutate(axis = str_remove(axis, "Eigenvalue"), 
         var_explained = 100*pcoa$eig/sum(pcoa_relab$eig)) |>
  slice_head(n = 10) |>
  gt() |>
  tab_header(title = "Eigenvalues of the PCoA") |>
  cols_label(axis = "Axis", `pcoa_relab$eig` = "Eigenvalue", var_explained = "Variance explained") |>
  fmt_number(columns = vars(`pcoa_relab$eig`, var_explained), decimals = 3)
Eigenvalues of the PCoA
Axis Eigenvalue Variance explained
1 71.593 19.670
2 59.108 16.257
3 39.074 10.844
4 19.664 7.003
5 15.305 5.155
6 12.043 4.403
7 10.414 4.031
8 8.623 3.514
9 7.367 3.068
10 6.533 2.850
Code
pcoa_relab$eig |> 
  as_data_frame() |>
  rownames_to_column("axis") |>
  mutate(axis = axis |> as.numeric()) |>
  ggplot(aes(x = axis, y = value)) +
  geom_col() +
  labs(
    x= "Axis",
    y = "Eigenvalue"
  ) 

Code
pcoa_relab$eig |> 
  as_data_frame() |>
  rownames_to_column("axis") |>
  mutate(axis = axis |> as.numeric()) |>
  filter(axis <= 20) |> 
  ggplot(aes(x = axis, y = value)) +
  geom_col() +
  labs(
    x= "Axis",
    y = "Eigenvalue"
  ) 

Code
pcoa_relab_data <- 
  tibble(
    sampleID = rownames(pcoa_relab$points),
    PCoA = pcoa_relab$points
  ) |> 
  left_join(
    SE_mg |> colData() |> as_tibble(),
    by = join_by(sampleID)
  )

Type of sample

Code
sample_type_colors <- c("gray", "dodgerblue", "orange", "black", "red")

pcoa_plot(pcoa_relab_data, pcoa_relab, "SampleType", c(1, 2)) + scale_color_manual(values = sample_type_colors) 

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "SampleType", c(3, 4)) + scale_color_manual(values = sample_type_colors) 

Code
manova <- adonis2(dist_matrix_relab ~ pcoa_relab_data$SampleType)
manova
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix_relab ~ pcoa_relab_data$SampleType)
          Df SumOfSqs      R2      F Pr(>F)    
Model      4    3.536 0.01141 2.7725  0.001 ***
Residual 961  306.448 0.98859                  
Total    965  309.984 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Extraction Plate

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "Ext_Lib_Plate", c(1, 2))

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "Ext_Lib_Plate", c(3, 4))

Permutation test

Code
manova <- adonis2(dist_matrix_relab ~ pcoa_relab_data$Ext_Lib_Plate)
manova
Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999

adonis2(formula = dist_matrix_relab ~ pcoa_relab_data$Ext_Lib_Plate)
          Df SumOfSqs      R2      F Pr(>F)    
Model      1    4.345 0.01402 13.704  0.001 ***
Residual 964  305.639 0.98598                  
Total    965  309.984 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Library Pool

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "Library.Pool", c(1, 2))

Code
pcoa_plot(pcoa_relab_data, pcoa_relab, "Library.Pool", c(3, 4))

Lane

Code
pcoa_plot(pcoa_relab_data |> filter(Lane %in% 1:4), pcoa_relab, "Lane", c(1, 2))

Code
pcoa_plot(pcoa_relab_data |> filter(Lane %in% 1:4), pcoa_relab, "Lane", c(3, 4))

Saving SummarizedExperiment objects

We save the SE objects to disk

Code
saveRDS(
  SE_mg, 
  str_c(
    get_output_dir(data_source = data_source),  
    "01_se_mg_", today() |> str_remove_all("-"), ".rds"
    )
  )